Load Data and Preprocess

Climate DF

Mobility DF

Transit DF

Collated Data

Descriptive Analysis

Transit

Transit Score® Description
90–100 Rider’s Paradise
World-class public transportation.
70–89 Excellent Transit
Transit is convenient for most trips.
50–69 Good Transit
Many nearby public transportation options.
25–49 Some Transit
A few nearby public transportation options.
0–24 Minimal Transit
It is possible to get on a bus.
ggplot(clean.transit.data, aes(x = z.transit)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Standardised transit scores in America",
       x = "Transit Scores",
       y = "Frequency") +
  bbplot::bbc_style()+
  theme(plot.title = element_text(size = 12),
        axis.title.x = element_text(size = 8), # Adjust the x-axis label size
    axis.title.y = element_text(size = 8))

ggplot(transit.map) +
  geom_sf(mapping = aes(fill = z.transit), colour = NA, inherit.aes = FALSE) +
  scale_fill_gradient2(
    low = "#2c7bb6",           # Blue
    mid = "#ffffbf",           # Yellow (midpoint)
    high = "#d7191c",          # Red
    midpoint = mean(transit.map$z.transit, na.rm = TRUE),
    limits = c(min(transit.map$z.transit, na.rm = TRUE),
               max(transit.map$z.transit, na.rm = TRUE)),
    label = scales::comma,
    na.value = "grey"
  ) +
  labs(title = 'Standardised Transit Distribution') +
  guides(fill = guide_legend(keyheight = 0.3)) +
  theme_void() +
  bbplot::bbc_style() +
  theme(
    plot.title = element_text(size = 16),         # Make the title larger
    legend.position = 'bottom',
    legend.title = element_blank(),
    legend.text = element_text(size = 10),        # Make legend text smaller
    axis.title = element_text(size = 10),         # Make axis labels smaller
    axis.text = element_text(size = 8)            # Make axis text smaller
  )

Mobility

ggplot(transit.mobility.data, aes(x = date, y = rolling_avg)) +
  geom_line(color = "blue") +
  labs(title = "Changes in Visitors to Transit Over Time",
       x = "Date",
       y = "7-day Rolling Average of Mobility (%)") +
  theme_minimal()

ggplot(workplace.mobility.data, aes(x = date, y = rolling_avg)) +
  geom_line(color = "blue") +
  labs(title = "Changes in Visitors to Workplaces Over Time",
       x = "Date",
       y = "7-day Rolling Average of Mobility (%)") +
  theme_minimal()

# Add a type column to each dataframe
grocery.mobility.data <- grocery.mobility.data %>%
  mutate(type = "Grocery and Pharmacy")

transit.mobility.data <- transit.mobility.data %>%
  mutate(type = "Transit")

workplace.mobility.data <- workplace.mobility.data %>%
  mutate(type = "Workplace")

# Combine the dataframes into one
combined_mobility_data <- bind_rows(grocery.mobility.data, transit.mobility.data, workplace.mobility.data)

# Create the ggplot
p <- ggplot(combined_mobility_data, aes(x = date, y = rolling_avg, color = type)) +
  geom_line() +
  labs(title = "Changes in Mobility Over Time",
       x = "Date",
       y = "7-day Rolling Average of Mobility (%)",
       color = "Mobility Type") +
  theme_minimal()

# Convert the ggplot to an interactive plotly plot
interactive_plot <- ggplotly(p)

# Display the interactive plot
interactive_plot

Climate data

# Create the plot using ggplot
p <- ggplot(climate.state, aes(x = month, y = avg_temp, group = state, color = state)) +
  geom_line() +
  geom_point() +
  labs(title = "Average Temperature Over Each Month Per State",
       x = "Month",
       y = "Average Temperature") +
  theme_minimal()

# Convert the ggplot object to plotly
p <- ggplotly(p, tooltip = c("x", "y", "color"))

# Show the interactive plot
p
month_order <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")

# Convert the month variable to a factor with the desired order
climate.map$month <- factor(climate.map$month, levels = month_order)

ggplot(climate.map) +
  geom_sf(mapping = aes(fill = temp), colour = NA, inherit.aes = FALSE) +
  scale_fill_gradient2(low = "#0072B2",
                       high = "red",
                       midpoint = mean(climate.map$temp, na.rm = TRUE),
                       limits = c(min(climate.map$temp, na.rm = TRUE),
                                  max(climate.map$temp, na.rm = TRUE)),
                       label = scales::comma,
                       na.value = "grey") +
  labs(title = 'Average Climate distribution') +
  guides(fill = guide_legend(keyheight = 0.3)) +
  theme_void() +
  theme(plot.title = element_text(size = 7),
        legend.position = 'bottom',
        legend.title = element_blank()) +
  facet_wrap(~month) # This line tells ggplot to create separate panels for each unique value in the month column. Each panel will display the map for one month.

Population Density: Urban-Rural Separation

The density per county, which measures the number of people living in a given area within a county, is an important factor in the spread of infectious diseases like COVID-19. Higher density means more people are concentrated in a smaller area, increasing the likelihood of person-to-person transmission through close contact. By analysing density per county, we can assess how different population concentrations impact the rate of new infections. This information can help identify areas that might need more stringent public health measures to control the spread of the virus.

As seen below area, with the highest areas tend to have higher number of case of covid.

ggplot(data = county_data_geo) +
      geom_sf(aes(fill = log(Density.per.square.mile.of.land.area...Population)), color = NA) +
      scale_fill_viridis_c(option = "viridis") +
      theme_minimal() +
      labs(title = "Population Density by County Standardised", 
           fill = "Density (per sq. mile)") +
  geom_point(data = top_counties, aes(x = long, y = lat, size = new_cases_sum), color = "red", alpha = 0.7) +
  scale_size_continuous(name = "Summed New Cases") +
  theme(legend.position = "right", axis.title.x = element_blank(),  # Remove x-axis title
        axis.title.y = element_blank())

Additionally, the county areas below show that within the 20 most dense counties, there is an intersection counties with the highest case number

Density_Cross$Area_Name
## [1] "New York County" "Baltimore city"

Additionally, this extend beyond case numbers and is also true for deaths in counties.

Density_Cross_deaths$Area
## [1] "Kings County"  "Queens County" "Cook County"

Age Demographics: Vulnerability of Elderly

The age distribution of a population, particularly the proportion of individuals over 65, is essential in studying COVID-19 spread and impact. Older adults are more susceptible to severe illness and complications from COVID-19. They are also more likely to require hospitalisation and intensive care. By examining the share of the population over 65, we can assess the potential burden on healthcare systems and identify regions where protective measures and vaccination campaigns might need to be prioritised to protect this vulnerable group.

Seen below, a visual trend of areas with a higher proportion of 65+ indicates somewhat higher scores of covid cases. For example areas like Florida that historically have high numbers of retirees. However, the is no intersection between the county areas with the highest proportion of 65+ and covid cases.

ggplot(data = county_data_geo) +
  geom_sf(aes(fill = log(Total_age85plusr/POP_ESTIMATE_2018)), color = NA) +
  scale_fill_viridis_c(option = "viridis") +
  theme_minimal() +
  labs(title = "Proportion of Population Age 65+ by County Standardised",
       fill = "Proportion") +
  geom_point(data = top_counties, aes(x = long, y = lat, size = new_cases_sum), color = "red", alpha = 0.7) +
  scale_size_continuous(name = "Summed New Cases") +
  theme(legend.position = "right", axis.title.x = element_blank(),  # Remove x-axis title
        axis.title.y = element_blank())

Healthcare Resources: Active Physicians per capita

The availability of healthcare resources, such as active physicians per 100,000 population, is a critical factor in managing and mitigating the effects of the COVID-19 pandemic. Areas with a higher number of active physicians can provide better medical care, testing, and treatment, which can help control the spread of the virus and reduce mortality rates. By analysing the distribution of active physicians, we can understand the capacity of different regions to respond to the pandemic and highlight areas that may need additional support and resources to effectively manage the healthcare demands caused by COVID-19.

However, the is no intersection between the county areas with the highest proportion of physicians and covid cases.

# Plot Active Physicians per 100,000
ggplot(data = county_data_geo) +
  geom_sf(aes(fill = log(Total.Active.Patient.Care.Physicians.per.100000.Population.2018..AAMC.)), color = NA) +
  scale_fill_viridis_c(option = "viridis") +
  theme_minimal() +
  labs(title = "Active Physicians per 100,000 by County Standardised",
       fill = "Physicians/100k") +
  geom_point(data = top_counties, aes(x = long, y = lat, size = new_cases_sum), color = "red", alpha = 0.7) +
  scale_size_continuous(name = "Summed New Cases") +
  theme(legend.position = "right", axis.title.x = element_blank(),  # Remove x-axis title
        axis.title.y = element_blank())

Active physicians, may be a confounding factor with population density, as higher dense city-areas may attract more physicians. And as there is low evidence above that increased physicians means less cases, it demonstrates that density is a more telling factor of cases then physicians.

Statistical Analysis

This statement that population density is more significant to contributing to case numbers can be proved statistically.

county_cases_sum_df <- as.data.frame(county_cases_sum)

county.covid_df <- as.data.frame(county.covid)
county.covid_df <- na.omit(county.covid_df)
county.covid_df <- county.covid_df %>%
  select(GEOID, pop.density, age.65.plus, active.patient.care.physicians)
model_df = left_join(county_cases_sum_df, county.covid_df, by = 'GEOID')

model_df <- na.omit(model_df)

model_df <- model_df %>%
  distinct(GEOID, .keep_all = TRUE)

model_proption65 = lm(new_cases_sum ~ pop.density + age.65.plus + active.patient.care.physicians, model_df)
# Tidy the model output
summary(model_proption65)
## 
## Call:
## lm(formula = new_cases_sum ~ pop.density + age.65.plus + active.patient.care.physicians, 
##     data = model_df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27237  -4736  -1631   1343 733266 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     2.809e+04  5.732e+03   4.900 1.03e-06 ***
## pop.density                     1.420e+00  4.452e-01   3.190  0.00144 ** 
## age.65.plus                    -1.598e+04  1.722e+04  -0.928  0.35356    
## active.patient.care.physicians  2.214e+00  2.317e+01   0.096  0.92386    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 33860 on 2123 degrees of freedom
## Multiple R-squared:  0.005681,   Adjusted R-squared:  0.004276 
## F-statistic: 4.043 on 3 and 2123 DF,  p-value: 0.007056

As seen above, compared to the other two variables, population density is most siginificant to indicating new cases.

Below is an interactive graph that demonstrates displays the top 100 counties and their new covid cases over 2020 - 2021. It can be identified visually the different Covid-waves/influxes in cases.

fig <- plot_ly(county.covid_long_top_df,
               x = ~long,
               y = ~lat,
               frame = ~year_month,
               ids = ~GEOID,
               type = 'scattergeo',
               mode = 'markers',
               text = ~county.x,        # Set the hover text to the county name
               hoverinfo = 'text',    # Only display the text (county name) in the hover info
               marker = list(
                 size = ~log(new.cases), # Set the size of the markers based on the normalized new cases
                 color = 'red',        # Optional: Set a constant color for the markers
                 opacity = 0.6,
                 line = list(width = 0)
               ),
               name = 'COVID-19 Cases') %>%
  layout(
    title = "New COVID-19 Cases by County in 2021 for Highest Reporting Counties",
    geo = list(
      scope = 'usa',               # Set the map scope to the USA
      projection = list(type = 'albers usa'),
      showland = TRUE,
      landcolor = 'rgb(217, 217, 217)',
      subunitwidth = 1,
      countrywidth = 1,
      subunitcolor = 'rgb(255, 255, 255)',
      countrycolor = 'rgb(255, 255, 255)'
    ),
    margin = list(l = 0, r = 0, t = 30, b = 0) # Adjust the margins if needed
  ) %>%
  animation_opts(frame = 1000, easing = "elastic", redraw = TRUE) %>%
  animation_button(x = 1, xanchor = "right", y = 0, yanchor = "bottom")
# Render the plotly object

fig

Model

“county_state” “date” “new.cases”
[4] “cases_14days” “county” “state”
[7] “FIPS” “total.population” “pop.density”
[10] “age.65.plus” “avg.dec.temp” “avg.jan.temp”
[13] “avg.feb.temp” “avg.mar.temp” “avg.apr.temp”
[16] “avg.may.temp” “avg.jun.temp” “avg.jul.temp”
[19] “avg.aug.temp” “avg.sep.temp” “avg.oct.temp”
[22] “avg.nov.temp” “icu.beds” “active.physicians”
[25] “active.patient.care.physicians” “housing.density” “transit.scores”
[28] “mobility.transit” “mobility.workplaces” “mobility.grocery”
[31] “new.cases.lag” “workplaces.lag14”

model2 <- lm(cases_14days ~ avg.jan.temp + avg.feb.temp + avg.mar.temp + avg.apr.temp + avg.may.temp + avg.jun.temp + avg.jul.temp + avg.aug.temp + avg.sep.temp + avg.oct.temp + avg.nov.temp + avg.dec.temp , covid.data)

summary(model2)
## 
## Call:
## lm(formula = cases_14days ~ avg.jan.temp + avg.feb.temp + avg.mar.temp + 
##     avg.apr.temp + avg.may.temp + avg.jun.temp + avg.jul.temp + 
##     avg.aug.temp + avg.sep.temp + avg.oct.temp + avg.nov.temp + 
##     avg.dec.temp, data = covid.data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##   -788    -56    -29      7  38704 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -397.00403    4.06516  -97.66   <2e-16 ***
## avg.jan.temp    1.49125    0.11727   12.72   <2e-16 ***
## avg.feb.temp   -7.73632    0.08672  -89.21   <2e-16 ***
## avg.mar.temp    6.24124    0.15992   39.03   <2e-16 ***
## avg.apr.temp    4.16294    0.17651   23.59   <2e-16 ***
## avg.may.temp   -5.82414    0.11941  -48.77   <2e-16 ***
## avg.jun.temp  -24.30396    0.18125 -134.09   <2e-16 ***
## avg.jul.temp   26.63539    0.18637  142.92   <2e-16 ***
## avg.aug.temp   -8.48234    0.16586  -51.14   <2e-16 ***
## avg.sep.temp    2.30451    0.12589   18.31   <2e-16 ***
## avg.oct.temp    8.02431    0.11472   69.95   <2e-16 ***
## avg.nov.temp    9.33308    0.17684   52.78   <2e-16 ***
## avg.dec.temp   -3.31985    0.12168  -27.28   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 240.6 on 2379966 degrees of freedom
##   (10961 observations deleted due to missingness)
## Multiple R-squared:  0.03495,    Adjusted R-squared:  0.03495 
## F-statistic:  7183 on 12 and 2379966 DF,  p-value: < 2.2e-16
model3 <- lm(cases_14days ~mobility.transit +mobility.grocery +mobility.workplaces + avg.jan.temp + avg.feb.temp + avg.mar.temp + avg.apr.temp + avg.may.temp + avg.jun.temp + avg.jul.temp + avg.aug.temp + avg.sep.temp + avg.oct.temp + avg.nov.temp + avg.dec.temp, covid.data)
summary(model3)
## 
## Call:
## lm(formula = cases_14days ~ mobility.transit + mobility.grocery + 
##     mobility.workplaces + avg.jan.temp + avg.feb.temp + avg.mar.temp + 
##     avg.apr.temp + avg.may.temp + avg.jun.temp + avg.jul.temp + 
##     avg.aug.temp + avg.sep.temp + avg.oct.temp + avg.nov.temp + 
##     avg.dec.temp, data = covid.data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##   -799    -74    -30     18  38597 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -561.34293    6.63602  -84.59   <2e-16 ***
## mobility.transit      -1.86760    0.01204 -155.09   <2e-16 ***
## mobility.grocery      -0.67687    0.01799  -37.63   <2e-16 ***
## mobility.workplaces   -0.91827    0.01813  -50.65   <2e-16 ***
## avg.jan.temp           4.81217    0.20528   23.44   <2e-16 ***
## avg.feb.temp         -10.46979    0.15244  -68.68   <2e-16 ***
## avg.mar.temp           8.94395    0.26505   33.74   <2e-16 ***
## avg.apr.temp          -3.75114    0.30867  -12.15   <2e-16 ***
## avg.may.temp          -6.13216    0.19875  -30.85   <2e-16 ***
## avg.jun.temp         -24.58477    0.29336  -83.81   <2e-16 ***
## avg.jul.temp          31.89003    0.32071   99.44   <2e-16 ***
## avg.aug.temp         -13.60139    0.30398  -44.74   <2e-16 ***
## avg.sep.temp          10.87912    0.22105   49.22   <2e-16 ***
## avg.oct.temp           4.75981    0.19643   24.23   <2e-16 ***
## avg.nov.temp          10.11356    0.31310   32.30   <2e-16 ***
## avg.dec.temp          -2.65084    0.21474  -12.34   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 300 on 1443118 degrees of freedom
##   (947806 observations deleted due to missingness)
## Multiple R-squared:  0.06917,    Adjusted R-squared:  0.06916 
## F-statistic:  7149 on 15 and 1443118 DF,  p-value: < 2.2e-16
library(lmtest)

# Durbin-Watson test
dwtest(model3)
## 
##  Durbin-Watson test
## 
## data:  model3
## DW = 0.016454, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
# Plot residuals vs fitted values
plot(fitted(model3), residuals(model3), main = "Residuals vs Fitted")
abline(h = 0, col = "red")

# Q-Q plot
qqnorm(residuals(model3))
qqline(residuals(model3), col = "red")

Results and data journalism project explaining the spread of COVID-19 in the US. Why has it impacted some